import pandas as pd
df = pd.read_csv('salaries_by_college_major.csv')
df.head()
| Undergraduate Major | Starting Median Salary | Mid-Career Median Salary | Mid-Career 10th Percentile Salary | Mid-Career 90th Percentile Salary | Group | |
|---|---|---|---|---|---|---|
| 0 | Accounting | 46000.0 | 77100.0 | 42200.0 | 152000.0 | Business |
| 1 | Aerospace Engineering | 57700.0 | 101000.0 | 64300.0 | 161000.0 | STEM |
| 2 | Agriculture | 42600.0 | 71900.0 | 36300.0 | 150000.0 | Business |
| 3 | Anthropology | 36800.0 | 61500.0 | 33800.0 | 138000.0 | HASS |
| 4 | Architecture | 41600.0 | 76800.0 | 50600.0 | 136000.0 | Business |
df.shape
(51, 6)
df.columns
Index(['Undergraduate Major', 'Starting Median Salary',
'Mid-Career Median Salary', 'Mid-Career 10th Percentile Salary',
'Mid-Career 90th Percentile Salary', 'Group'],
dtype='object')
df.isna()
| Undergraduate Major | Starting Median Salary | Mid-Career Median Salary | Mid-Career 10th Percentile Salary | Mid-Career 90th Percentile Salary | Group | |
|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False |
| 5 | False | False | False | False | False | False |
| 6 | False | False | False | False | False | False |
| 7 | False | False | False | False | False | False |
| 8 | False | False | False | False | False | False |
| 9 | False | False | False | False | False | False |
| 10 | False | False | False | False | False | False |
| 11 | False | False | False | False | False | False |
| 12 | False | False | False | False | False | False |
| 13 | False | False | False | False | False | False |
| 14 | False | False | False | False | False | False |
| 15 | False | False | False | False | False | False |
| 16 | False | False | False | False | False | False |
| 17 | False | False | False | False | False | False |
| 18 | False | False | False | False | False | False |
| 19 | False | False | False | False | False | False |
| 20 | False | False | False | False | False | False |
| 21 | False | False | False | False | False | False |
| 22 | False | False | False | False | False | False |
| 23 | False | False | False | False | False | False |
| 24 | False | False | False | False | False | False |
| 25 | False | False | False | False | False | False |
| 26 | False | False | False | False | False | False |
| 27 | False | False | False | False | False | False |
| 28 | False | False | False | False | False | False |
| 29 | False | False | False | False | False | False |
| 30 | False | False | False | False | False | False |
| 31 | False | False | False | False | False | False |
| 32 | False | False | False | False | False | False |
| 33 | False | False | False | False | False | False |
| 34 | False | False | False | False | False | False |
| 35 | False | False | False | False | False | False |
| 36 | False | False | False | False | False | False |
| 37 | False | False | False | False | False | False |
| 38 | False | False | False | False | False | False |
| 39 | False | False | False | False | False | False |
| 40 | False | False | False | False | False | False |
| 41 | False | False | False | False | False | False |
| 42 | False | False | False | False | False | False |
| 43 | False | False | False | False | False | False |
| 44 | False | False | False | False | False | False |
| 45 | False | False | False | False | False | False |
| 46 | False | False | False | False | False | False |
| 47 | False | False | False | False | False | False |
| 48 | False | False | False | False | False | False |
| 49 | False | False | False | False | False | False |
| 50 | False | True | True | True | True | True |
df.tail()
| Undergraduate Major | Starting Median Salary | Mid-Career Median Salary | Mid-Career 10th Percentile Salary | Mid-Career 90th Percentile Salary | Group | |
|---|---|---|---|---|---|---|
| 46 | Psychology | 35900.0 | 60400.0 | 31600.0 | 127000.0 | HASS |
| 47 | Religion | 34100.0 | 52000.0 | 29700.0 | 96400.0 | HASS |
| 48 | Sociology | 36500.0 | 58200.0 | 30700.0 | 118000.0 | HASS |
| 49 | Spanish | 34000.0 | 53100.0 | 31000.0 | 96400.0 | HASS |
| 50 | Source: PayScale Inc. | NaN | NaN | NaN | NaN | NaN |
clean_df = df.dropna()
clean_df.tail()
| Undergraduate Major | Starting Median Salary | Mid-Career Median Salary | Mid-Career 10th Percentile Salary | Mid-Career 90th Percentile Salary | Group | |
|---|---|---|---|---|---|---|
| 45 | Political Science | 40800.0 | 78200.0 | 41200.0 | 168000.0 | HASS |
| 46 | Psychology | 35900.0 | 60400.0 | 31600.0 | 127000.0 | HASS |
| 47 | Religion | 34100.0 | 52000.0 | 29700.0 | 96400.0 | HASS |
| 48 | Sociology | 36500.0 | 58200.0 | 30700.0 | 118000.0 | HASS |
| 49 | Spanish | 34000.0 | 53100.0 | 31000.0 | 96400.0 | HASS |
clean_df['Starting Median Salary'].max()
74300.0
clean_df['Starting Median Salary'].idxmax()
43
clean_df['Starting Median Salary'][43]
74300.0
clean_df['Undergraduate Major'].loc[43]
'Physician Assistant'
clean_df['Undergraduate Major'][43]
'Physician Assistant'
clean_df.loc[43]
Undergraduate Major Physician Assistant Starting Median Salary 74300.0 Mid-Career Median Salary 91700.0 Mid-Career 10th Percentile Salary 66400.0 Mid-Career 90th Percentile Salary 124000.0 Group STEM Name: 43, dtype: object
print(clean_df['Mid-Career Median Salary'].max())
print(f"Index for the max mid career salary: {clean_df['Mid-Career Median Salary'].idxmax()}")
clean_df['Undergraduate Major'][8]
107000.0 Index for the max mid career salary: 8
'Chemical Engineering'
print(clean_df['Starting Median Salary'].min())
clean_df['Undergraduate Major'].loc[clean_df['Starting Median Salary'].idxmin()]
34000.0
'Spanish'
clean_df.loc[clean_df['Mid-Career Median Salary'].idxmin()]
Undergraduate Major Education Starting Median Salary 34900.0 Mid-Career Median Salary 52000.0 Mid-Career 10th Percentile Salary 29300.0 Mid-Career 90th Percentile Salary 102000.0 Group HASS Name: 18, dtype: object
# clean_df['Mid-Career 90th Percentile Salary'] - clean_df['Mid-Career 10th Percentile Salary']
# clean_df['Mid-Career 90th Percentile Salary'].subtract(clean_df['Mid-Career 10th Percentile Salary'])
spread_col = clean_df['Mid-Career 90th Percentile Salary'] - clean_df['Mid-Career 10th Percentile Salary']
clean_df.insert(1, 'Spread', spread_col)
clean_df.head()
| Undergraduate Major | Spread | Starting Median Salary | Mid-Career Median Salary | Mid-Career 10th Percentile Salary | Mid-Career 90th Percentile Salary | Group | |
|---|---|---|---|---|---|---|---|
| 0 | Accounting | 109800.0 | 46000.0 | 77100.0 | 42200.0 | 152000.0 | Business |
| 1 | Aerospace Engineering | 96700.0 | 57700.0 | 101000.0 | 64300.0 | 161000.0 | STEM |
| 2 | Agriculture | 113700.0 | 42600.0 | 71900.0 | 36300.0 | 150000.0 | Business |
| 3 | Anthropology | 104200.0 | 36800.0 | 61500.0 | 33800.0 | 138000.0 | HASS |
| 4 | Architecture | 85400.0 | 41600.0 | 76800.0 | 50600.0 | 136000.0 | Business |
low_risk = clean_df.sort_values('Spread')
low_risk[['Undergraduate Major', 'Spread']].head()
| Undergraduate Major | Spread | |
|---|---|---|
| 40 | Nursing | 50700.0 |
| 43 | Physician Assistant | 57600.0 |
| 41 | Nutrition | 65300.0 |
| 49 | Spanish | 65400.0 |
| 27 | Health Care Administration | 66400.0 |
highest_potential = clean_df.sort_values('Mid-Career 90th Percentile Salary', ascending=False)
highest_potential[['Undergraduate Major', 'Mid-Career 90th Percentile Salary']].head()
| Undergraduate Major | Mid-Career 90th Percentile Salary | |
|---|---|---|
| 17 | Economics | 210000.0 |
| 22 | Finance | 195000.0 |
| 8 | Chemical Engineering | 194000.0 |
| 37 | Math | 183000.0 |
| 44 | Physics | 178000.0 |
highest_spread = clean_df.sort_values('Spread', ascending=False)
highest_spread[['Undergraduate Major', 'Spread']].head()
| Undergraduate Major | Spread | |
|---|---|---|
| 17 | Economics | 159400.0 |
| 22 | Finance | 147800.0 |
| 37 | Math | 137800.0 |
| 36 | Marketing | 132900.0 |
| 42 | Philosophy | 132500.0 |
highest_spread = clean_df.sort_values('Mid-Career Median Salary', ascending=False)
highest_spread[['Undergraduate Major', 'Mid-Career Median Salary']].head()
| Undergraduate Major | Mid-Career Median Salary | |
|---|---|---|
| 8 | Chemical Engineering | 107000.0 |
| 12 | Computer Engineering | 105000.0 |
| 19 | Electrical Engineering | 103000.0 |
| 1 | Aerospace Engineering | 101000.0 |
| 17 | Economics | 98600.0 |
clean_df.groupby('Group').count()
| Undergraduate Major | Spread | Starting Median Salary | Mid-Career Median Salary | Mid-Career 10th Percentile Salary | Mid-Career 90th Percentile Salary | |
|---|---|---|---|---|---|---|
| Group | ||||||
| Business | 12 | 12 | 12 | 12 | 12 | 12 |
| HASS | 22 | 22 | 22 | 22 | 22 | 22 |
| STEM | 16 | 16 | 16 | 16 | 16 | 16 |
pd.options.display.float_format = '{:,.2f}'.format
clean_df.groupby('Group').mean()
| Spread | Starting Median Salary | Mid-Career Median Salary | Mid-Career 10th Percentile Salary | Mid-Career 90th Percentile Salary | |
|---|---|---|---|---|---|
| Group | |||||
| Business | 103,958.33 | 44,633.33 | 75,083.33 | 43,566.67 | 147,525.00 |
| HASS | 95,218.18 | 37,186.36 | 62,968.18 | 34,145.45 | 129,363.64 |
| STEM | 101,600.00 | 53,862.50 | 90,812.50 | 56,025.00 | 157,625.00 |